library(SummarizedExperiment)
## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unlist, unsplit
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(edgeR)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:IRanges':
##
## collapse
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
##
## Attaching package: 'genefilter'
## The following object is masked from 'package:base':
##
## anyNA
library(ggplot2)
library(geneplotter)
## Loading required package: lattice
## Loading required package: annotate
## Loading required package: AnnotationDbi
## Loading required package: XML
We load the raw RNA-seq data counts set of Kidney renal clear-cell-carcinoma.
se <- readRDS(file.path("~/Documentos/MASTER/IEO/KIDNEY/seKIRC.rds"))
se
## class: RangedSummarizedExperiment
## dim: 20115 614
## metadata(5): experimentData annotation cancerTypeCode
## cancerTypeDescription objectCreationDate
## assays(1): counts
## rownames(20115): 1 2 ... 102724473 103091865
## rowRanges metadata column names(3): symbol txlen txgc
## colnames(614): TCGA.3Z.A93Z.01A.11R.A37O.07
## TCGA.6D.AA2E.01A.11R.A37O.07 ... TCGA.CZ.5988.11A.01R.1672.07
## TCGA.CZ.5989.11A.01R.1672.07
## colData names(549): type bcr_patient_uuid ...
## lymph_nodes_aortic_pos_by_ihc lymph_nodes_aortic_pos_total
We create a DGEList’ object to hold the dataset to be analysed in a better and more comprehensive way.
library(edgeR)
dge <- DGEList(counts=assays(se)$counts, genes=mcols(se))
## Warning in as.data.frame.DataFrame(genes, stringsAsFactors = FALSE):
## Arguments in '...' ignored
Moreover, we calculate \(\log_2\) CPM values of expression and we save them in an assay element.
assays(se)$logCPM <- cpm(dge, log=TRUE, prior.count=0.5)
At this point, we need to determine if the library sizes of tumor samples and normal samples are similar or not.
libsize <- data.frame(libsize = dge$sample$lib.size/1e6, type = se$type)
summary(libsize)
## libsize type
## Min. : 3.049 normal: 72
## 1st Qu.: 38.075 tumor :542
## Median : 46.715
## Mean : 46.679
## 3rd Qu.: 54.647
## Max. :115.546
ggplot(libsize) +
geom_density(aes(x=libsize, fill=type), alpha=0.5) +
ylab("density\n") + xlab("\nMillions of Reads") +
theme_bw()
ggplot(libsize) +
geom_histogram(
aes(x=libsize, fill=type, y = (..count..)/sum(..count..)), alpha=0.5, binwidth=2
) + xlab("\nMillions of Reads") + ylab("% of Samples\n") + theme_bw()
dge.filt <- dge[, dge$sample$lib.size/1e6 > 45 ]
final.samples <- rownames(dge.filt$samples)
se.filt <-se[,colnames(se) %in% final.samples]
se.normal <- se.filt[,se.filt$type == "normal"]
se.tumor <- se.filt[,se.filt$type == "tumor"]
normal.code <- substr(colnames(se.normal), 9, 12)
tumor.code <- substr(colnames(se.tumor), 9, 12)
common.codes <- intersect(normal.code, tumor.code)
length(common.codes)
## [1] 38
se.paired <- se.filt[,substr(colnames(se.filt), 9, 12) %in% common.codes]
summary(se.paired$type)
## normal tumor
## 38 38
colData(se.paired)$samplecodes <- substr(colnames(se.paired), 9, 12)
dge.filt <- DGEList(counts=assays(se.paired)$counts, genes=mcols(se.paired))
## Warning in as.data.frame.DataFrame(genes, stringsAsFactors = FALSE):
## Arguments in '...' ignored
dge.filt
## An object of class "DGEList"
## $counts
## TCGA.B0.4712.01A.01R.1503.07 TCGA.B0.5402.01A.01R.1503.07
## 1 171 52
## 2 72089 254994
## 9 0 0
## 10 0 0
## 12 1488 1894
## TCGA.B0.5690.01A.11R.1541.07 TCGA.B0.5705.01A.11R.1541.07
## 1 129 91
## 2 214306 141888
## 9 0 0
## 10 0 0
## 12 1949 2848
## TCGA.B0.5709.01A.11R.1541.07 TCGA.B0.5711.01A.11R.1672.07
## 1 182 43
## 2 212776 193205
## 9 0 0
## 10 0 0
## 12 2491 2067
## TCGA.B0.5712.01A.11R.1672.07 TCGA.B2.5636.01A.02R.1541.07
## 1 15 221
## 2 58923 162102
## 9 0 0
## 10 0 0
## 12 1731 2464
## TCGA.B8.5549.01A.01R.1541.07 TCGA.B8.5552.01B.11R.1672.07
## 1 67 124
## 2 148006 250803
## 9 0 0
## 10 0 0
## 12 1593 2740
## TCGA.CJ.5680.01A.11R.1541.07 TCGA.CJ.6030.01A.11R.1672.07
## 1 82 88
## 2 74680 119037
## 9 0 0
## 10 0 0
## 12 2291 1876
## TCGA.CW.5580.01A.01R.1672.07 TCGA.CW.5585.01A.01R.1541.07
## 1 73 55
## 2 214475 137990
## 9 0 0
## 10 0 0
## 12 3628 2699
## TCGA.CW.5587.01A.01R.1541.07 TCGA.CW.5589.01A.01R.1541.07
## 1 283 104
## 2 147003 161231
## 9 0 0
## 10 0 0
## 12 2481 1940
## TCGA.CW.5591.01A.01R.1541.07 TCGA.CW.6087.01A.11R.1672.07
## 1 48 254
## 2 141961 44768
## 9 0 0
## 10 0 0
## 12 1799 2399
## TCGA.CW.6088.01A.11R.1672.07 TCGA.CW.6090.01A.11R.1672.07
## 1 36 91
## 2 200085 120305
## 9 0 0
## 10 0 0
## 12 3484 1952
## TCGA.CZ.4863.01A.01R.1503.07 TCGA.CZ.4864.01A.01R.1503.07
## 1 97 232
## 2 80757 121483
## 9 0 0
## 10 0 0
## 12 2495 3438
## TCGA.CZ.4865.01A.02R.1503.07 TCGA.CZ.5452.01A.01R.1503.07
## 1 141 192
## 2 196075 90934
## 9 0 0
## 10 0 0
## 12 2491 1814
## TCGA.CZ.5453.01A.01R.1503.07 TCGA.CZ.5454.01A.01R.1503.07
## 1 31 130
## 2 122695 190754
## 9 0 0
## 10 0 0
## 12 1631 1745
## TCGA.CZ.5455.01A.01R.1503.07 TCGA.CZ.5457.01A.01R.1503.07
## 1 43 427
## 2 277893 130344
## 9 0 0
## 10 0 0
## 12 2907 1429
## TCGA.CZ.5458.01A.01R.1503.07 TCGA.CZ.5461.01A.01R.1503.07
## 1 121 128
## 2 210354 183064
## 9 0 0
## 10 0 0
## 12 2061 2368
## TCGA.CZ.5462.01A.01R.1503.07 TCGA.CZ.5463.01A.01R.1503.07
## 1 47 46
## 2 53290 160439
## 9 0 0
## 10 0 0
## 12 2032 3366
## TCGA.CZ.5465.01A.01R.1503.07 TCGA.CZ.5467.01A.01R.1503.07
## 1 60 94
## 2 151711 187303
## 9 0 0
## 10 0 0
## 12 3780 2391
## TCGA.CZ.5468.01A.01R.1503.07 TCGA.CZ.5469.01A.01R.1503.07
## 1 315 441
## 2 84264 48436
## 9 0 0
## 10 0 0
## 12 1944 2420
## TCGA.CZ.5470.01A.01R.1503.07 TCGA.CZ.5984.01A.11R.1672.07
## 1 161 346
## 2 101134 107521
## 9 0 0
## 10 0 0
## 12 1860 2169
## TCGA.B0.4712.11A.02R.1503.07 TCGA.B0.5402.11A.01R.1503.07
## 1 125 129
## 2 53444 212158
## 9 0 0
## 10 0 0
## 12 2611 1831
## TCGA.B0.5690.11A.01R.1541.07 TCGA.B0.5705.11A.01R.1541.07
## 1 48 75
## 2 44253 94956
## 9 0 0
## 10 0 0
## 12 2159 2375
## TCGA.B0.5709.11A.01R.1541.07 TCGA.B0.5711.11A.01R.1672.07
## 1 100 98
## 2 61381 81181
## 9 0 0
## 10 0 0
## 12 2074 2678
## TCGA.B0.5712.11A.01R.1672.07 TCGA.B2.5636.11A.01R.1541.07
## 1 94 63
## 2 165423 88791
## 9 0 0
## 10 0 0
## 12 1845 2033
## TCGA.B8.5549.11A.01R.1541.07 TCGA.B8.5552.11A.01R.1672.07
## 1 62 66
## 2 68264 122109
## 9 0 0
## 10 0 0
## 12 2297 1671
## TCGA.CJ.5680.11A.01R.1541.07 TCGA.CJ.6030.11A.01R.1672.07
## 1 71 61
## 2 85768 66273
## 9 0 0
## 10 0 0
## 12 2094 1644
## TCGA.CW.5580.11A.02R.1672.07 TCGA.CW.5585.11A.01R.1541.07
## 1 139 75
## 2 162520 111561
## 9 0 0
## 10 0 0
## 12 2849 2009
## TCGA.CW.5587.11A.01R.1541.07 TCGA.CW.5589.11A.01R.1541.07
## 1 64 53
## 2 57992 58145
## 9 0 0
## 10 0 0
## 12 2005 1909
## TCGA.CW.5591.11A.01R.1541.07 TCGA.CW.6087.11A.01R.1672.07
## 1 71 64
## 2 268276 50936
## 9 0 0
## 10 0 0
## 12 3544 2215
## TCGA.CW.6088.11A.01R.1672.07 TCGA.CW.6090.11A.01R.1672.07
## 1 37 52
## 2 124218 54273
## 9 0 0
## 10 0 0
## 12 3581 2060
## TCGA.CZ.4863.11A.01R.1503.07 TCGA.CZ.4864.11A.01R.1503.07
## 1 142 43
## 2 100202 100861
## 9 0 0
## 10 0 0
## 12 2499 2015
## TCGA.CZ.4865.11A.01R.1503.07 TCGA.CZ.5452.11A.01R.1503.07
## 1 87 81
## 2 133636 57375
## 9 0 0
## 10 0 0
## 12 2079 2036
## TCGA.CZ.5453.11A.01R.1503.07 TCGA.CZ.5454.11A.01R.1503.07
## 1 63 124
## 2 129106 160618
## 9 0 0
## 10 0 1
## 12 2225 3201
## TCGA.CZ.5455.11A.01R.1503.07 TCGA.CZ.5457.11A.01R.1503.07
## 1 65 165
## 2 132123 91025
## 9 0 0
## 10 0 0
## 12 2145 2274
## TCGA.CZ.5458.11A.01R.1503.07 TCGA.CZ.5461.11A.01R.1503.07
## 1 105 78
## 2 125882 114075
## 9 0 0
## 10 0 0
## 12 2349 2264
## TCGA.CZ.5462.11A.01R.1503.07 TCGA.CZ.5463.11A.01R.1503.07
## 1 109 84
## 2 107009 94839
## 9 0 0
## 10 0 0
## 12 3000 2682
## TCGA.CZ.5465.11A.01R.1503.07 TCGA.CZ.5467.11A.01R.1503.07
## 1 137 138
## 2 71648 96987
## 9 0 0
## 10 0 0
## 12 2802 1589
## TCGA.CZ.5468.11A.01R.1503.07 TCGA.CZ.5469.11A.01R.1503.07
## 1 156 109
## 2 115785 95492
## 9 0 0
## 10 0 0
## 12 2083 2016
## TCGA.CZ.5470.11A.01R.1503.07 TCGA.CZ.5984.11A.01R.1672.07
## 1 133 77
## 2 175437 103163
## 9 0 0
## 10 0 0
## 12 2906 1845
## 20110 more rows ...
##
## $samples
## group lib.size norm.factors
## TCGA.B0.4712.01A.01R.1503.07 1 49725026 1
## TCGA.B0.5402.01A.01R.1503.07 1 62640283 1
## TCGA.B0.5690.01A.11R.1541.07 1 45633765 1
## TCGA.B0.5705.01A.11R.1541.07 1 60479323 1
## TCGA.B0.5709.01A.11R.1541.07 1 58992784 1
## 71 more rows ...
##
## $genes
## symbol txlen txgc
## 1 A1BG 3322 0.5644190
## 2 A2M 4844 0.4882329
## 3 NAT1 2280 0.3942982
## 4 NAT2 1322 0.3895613
## 5 SERPINA3 3067 0.5249429
## 20110 more rows ...
Figure S1: Library sizes in increasing order.
par(mfrow=c(1,2), mar=c(4,5,1,1))
In the next part, we take a look on the distribution of expression values per sample in terms of logarithmic CPM units.
multidensity(as.list(as.data.frame(assays(se.paired)$logCPM)), xlab = "log2 CPM", legend = NULL, main = "All samples", cex.axis = 1.2, cex.lab = 1.5, las = 1)
We cannot observe significant diferences between samples in terms of expression levels.
As we have more than 200 samples, we now display the normal and tumor distribution separately.
normalCPM <- se.paired[,se.paired$type == "normal"]
tumorCPM <- se.paired[,se.paired$type == "tumor"]
multidensity(as.list(as.data.frame(assays(normalCPM)$logCPM)), xlab = "log2 CPM", legend = NULL, main = "Normal", cex.axis = 1.2, cex.lab = 1.5, las = 1)
multidensity(as.list(as.data.frame(assays(tumorCPM)$logCPM)), xlab = "log2 CPM", legend = NULL, main = "Tumor", cex.axis = 1.2, cex.lab = 1.5, las = 1)
Now, we want to filter out the genes with a low expression. To do that, we can plot the CPMs in logarithmic scale.
genemean <- data.frame(Means=rowMeans(assays(se.paired)$logCPM[,-1]))
ggplot(genemean) +
geom_bar(aes(x=Means), binwidth = 1, fill="white", color="black") +
theme_bw() + xlab("\nlog2CPM") + ylab("Count\n") +
geom_vline(xintercept=mean(genemean$Means), color="red")
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.
We can see how there are two peaks of genes. The first one correspond to these genes with a very low expression and thus we decide to remove them. In other to do that, we used the mean as a threshold.
avgexp <- rowMeans(assays(se.paired)$logCPM)
mask <- avgexp > mean(genemean$Means)
These are the numbers of genes before filtering:
dim(se.paired) # Before filtering SE
## [1] 20115 76
dim(dge.filt) # Before filtering DGE
## [1] 20115 76
These are the numbers of samples genes after filtering:
se.paired.genes <- se.paired[mask, ]
dim(se.paired.genes) # After filtering SE
## [1] 11413 76
dge.filt <- dge.filt[mask, ]
dim(dge.filt) # After filtering DGE
## [1] 11413 76
At this point, we can calculate the normaliation factors on the filtered expression data set.
dgenorm <- calcNormFactors(dge.filt)
assays(se.paired.genes)$logCPMnorm <- cpm(dgenorm, log=TRUE, prior.count=0.5)
We look at the MA-plots of the normal samples to see if there are systematic biases in gene expression levels in any of the sampes. We expect the slope to be around zero.
Figure S4: MA-plots of the normal samples.
Figure S4: MA-plots of the normal samples.
As we can see, we don’t see any samples with major biases in gene expression.
Now, we examine the tumor samples:
Figure S4: MA-plots of the tumor samples.
Now we’re going to analyze our data in order to search for batch effect that could interfere with the biological signal. First, we analyze some of the information contained in the barcode, such as tissue source site, center of sequenciation, plate and portion and analyte combinations. We use two approaches to try to identify batch effect, the Hierarchical Clustering and a Multidimensional Scaling plot.
tss <- substr(colnames(se.paired.genes), 6, 7)
table(tss)
## tss
## B0 B2 B8 CJ CW CZ
## 14 2 4 4 16 36
center <- substr(colnames(se.paired.genes), 27, 28)
table(center)
## center
## 07
## 76
plate <- substr(colnames(se.paired.genes), 22, 25)
table(plate)
## plate
## 1503 1541 1672
## 38 20 18
portionanalyte <- substr(colnames(se.paired.genes), 18, 20)
table(portionanalyte)
## portionanalyte
## 01R 02R 11R
## 60 4 12
samplevial <- substr(colnames(se.paired.genes), 14, 16)
table(samplevial)
## samplevial
## 01A 01B 11A
## 37 1 38
table(data.frame(TYPE=se.paired.genes$type, GENDER=se.paired.genes$gender))
## GENDER
## TYPE FEMALE MALE
## normal 13 25
## tumor 13 25
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.paired.genes$gender))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TSS", sort(unique(batch)), levels(factor(se.paired.genes$gender))), fill=sort(unique(batch)))
Figure S6: Hierarchical clustering of the samples.
plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("Gender", sort(unique(batch)), levels(factor(se.paired.genes$gender))),
fill=sort(unique(batch)), inset=0.05)
Figure S7: Multidimensional scaling plot of the samples.
table(data.frame(TYPE=se.paired.genes$type, TSS=tss))
## TSS
## TYPE B0 B2 B8 CJ CW CZ
## normal 7 1 2 2 8 18
## tumor 7 1 2 2 8 18
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(tss))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TSS", sort(unique(batch)), levels(factor(tss))), fill=sort(unique(batch)))
Figure S6: Hierarchical clustering of the samples.
plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("TSS", sort(unique(batch)), levels(factor(tss))),
fill=sort(unique(batch)), inset=0.05)
Figure S7: Multidimensional scaling plot of the samples.
table(data.frame(TYPE=se.paired.genes$type, PLATE=plate))
## PLATE
## TYPE 1503 1541 1672
## normal 19 10 9
## tumor 19 10 9
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(plate))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("PLATE", sort(unique(batch)), levels(factor(plate))), fill=sort(unique(batch)))
Figure S6: Hierarchical clustering of the samples.
plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("plate", sort(unique(batch)), levels(factor(plate))),
fill=sort(unique(batch)), inset=0.05)
Figure S7: Multidimensional scaling plot of the samples.
table(data.frame(TYPE=se.paired.genes$type, P.analyte = portionanalyte))
## P.analyte
## TYPE 01R 02R 11R
## normal 36 2 0
## tumor 24 2 12
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(portionanalyte))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("Portionanalyte", sort(unique(batch)), levels(factor(portionanalyte))), fill=sort(unique(batch)))
Figure S6: Hierarchical clustering of the samples.
plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("Portionanalyte", sort(unique(batch)), levels(factor(portionanalyte))),
fill=sort(unique(batch)), inset=0.05)
Figure S7: Multidimensional scaling plot of the samples.
design <- model.matrix(~type + samplecodes, data = colData(se.paired.genes))
summary(design)
## (Intercept) typetumor samplecodes4863 samplecodes4864
## Min. :1 Min. :0.0 Min. :0.00000 Min. :0.00000
## 1st Qu.:1 1st Qu.:0.0 1st Qu.:0.00000 1st Qu.:0.00000
## Median :1 Median :0.5 Median :0.00000 Median :0.00000
## Mean :1 Mean :0.5 Mean :0.02632 Mean :0.02632
## 3rd Qu.:1 3rd Qu.:1.0 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1 Max. :1.0 Max. :1.00000 Max. :1.00000
## samplecodes4865 samplecodes5402 samplecodes5452 samplecodes5453
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## samplecodes5454 samplecodes5455 samplecodes5457 samplecodes5458
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## samplecodes5461 samplecodes5462 samplecodes5463 samplecodes5465
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## samplecodes5467 samplecodes5468 samplecodes5469 samplecodes5470
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## samplecodes5549 samplecodes5552 samplecodes5580 samplecodes5585
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## samplecodes5587 samplecodes5589 samplecodes5591 samplecodes5636
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## samplecodes5680 samplecodes5690 samplecodes5705 samplecodes5709
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## samplecodes5711 samplecodes5712 samplecodes5984 samplecodes6030
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## samplecodes6087 samplecodes6088 samplecodes6090
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.02632 Mean :0.02632 Mean :0.02632
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000
dgenorm
## An object of class "DGEList"
## $counts
## TCGA.B0.4712.01A.01R.1503.07 TCGA.B0.5402.01A.01R.1503.07
## 2 72089 254994
## 12 1488 1894
## 14 8130 13499
## 16 15743 8655
## 18 285 5697
## TCGA.B0.5690.01A.11R.1541.07 TCGA.B0.5705.01A.11R.1541.07
## 2 214306 141888
## 12 1949 2848
## 14 5934 10251
## 16 7642 12714
## 18 821 6150
## TCGA.B0.5709.01A.11R.1541.07 TCGA.B0.5711.01A.11R.1672.07
## 2 212776 193205
## 12 2491 2067
## 14 8520 6996
## 16 11427 8106
## 18 2424 781
## TCGA.B0.5712.01A.11R.1672.07 TCGA.B2.5636.01A.02R.1541.07
## 2 58923 162102
## 12 1731 2464
## 14 9469 5671
## 16 8175 12293
## 18 2964 483
## TCGA.B8.5549.01A.01R.1541.07 TCGA.B8.5552.01B.11R.1672.07
## 2 148006 250803
## 12 1593 2740
## 14 4861 5906
## 16 12127 9849
## 18 880 692
## TCGA.CJ.5680.01A.11R.1541.07 TCGA.CJ.6030.01A.11R.1672.07
## 2 74680 119037
## 12 2291 1876
## 14 9380 9306
## 16 15652 9259
## 18 7623 1214
## TCGA.CW.5580.01A.01R.1672.07 TCGA.CW.5585.01A.01R.1541.07
## 2 214475 137990
## 12 3628 2699
## 14 8573 9558
## 16 10712 8662
## 18 871 1005
## TCGA.CW.5587.01A.01R.1541.07 TCGA.CW.5589.01A.01R.1541.07
## 2 147003 161231
## 12 2481 1940
## 14 10829 6297
## 16 11349 7001
## 18 3171 3062
## TCGA.CW.5591.01A.01R.1541.07 TCGA.CW.6087.01A.11R.1672.07
## 2 141961 44768
## 12 1799 2399
## 14 4985 7436
## 16 9385 21165
## 18 5870 1801
## TCGA.CW.6088.01A.11R.1672.07 TCGA.CW.6090.01A.11R.1672.07
## 2 200085 120305
## 12 3484 1952
## 14 7859 8142
## 16 10274 8952
## 18 4186 608
## TCGA.CZ.4863.01A.01R.1503.07 TCGA.CZ.4864.01A.01R.1503.07
## 2 80757 121483
## 12 2495 3438
## 14 8369 9499
## 16 10205 11632
## 18 1810 1263
## TCGA.CZ.4865.01A.02R.1503.07 TCGA.CZ.5452.01A.01R.1503.07
## 2 196075 90934
## 12 2491 1814
## 14 11115 7968
## 16 12748 9549
## 18 1245 1826
## TCGA.CZ.5453.01A.01R.1503.07 TCGA.CZ.5454.01A.01R.1503.07
## 2 122695 190754
## 12 1631 1745
## 14 5293 8773
## 16 12352 9966
## 18 4484 626
## TCGA.CZ.5455.01A.01R.1503.07 TCGA.CZ.5457.01A.01R.1503.07
## 2 277893 130344
## 12 2907 1429
## 14 9900 9737
## 16 13560 15042
## 18 1230 3262
## TCGA.CZ.5458.01A.01R.1503.07 TCGA.CZ.5461.01A.01R.1503.07
## 2 210354 183064
## 12 2061 2368
## 14 8476 13330
## 16 8138 12106
## 18 618 504
## TCGA.CZ.5462.01A.01R.1503.07 TCGA.CZ.5463.01A.01R.1503.07
## 2 53290 160439
## 12 2032 3366
## 14 8986 11367
## 16 13632 12872
## 18 1178 2910
## TCGA.CZ.5465.01A.01R.1503.07 TCGA.CZ.5467.01A.01R.1503.07
## 2 151711 187303
## 12 3780 2391
## 14 13893 7658
## 16 22800 12269
## 18 507 676
## TCGA.CZ.5468.01A.01R.1503.07 TCGA.CZ.5469.01A.01R.1503.07
## 2 84264 48436
## 12 1944 2420
## 14 5298 7583
## 16 22086 7607
## 18 154 464
## TCGA.CZ.5470.01A.01R.1503.07 TCGA.CZ.5984.01A.11R.1672.07
## 2 101134 107521
## 12 1860 2169
## 14 9693 8567
## 16 9597 11801
## 18 895 1526
## TCGA.B0.4712.11A.02R.1503.07 TCGA.B0.5402.11A.01R.1503.07
## 2 53444 212158
## 12 2611 1831
## 14 9926 7167
## 16 12248 11301
## 18 1700 9533
## TCGA.B0.5690.11A.01R.1541.07 TCGA.B0.5705.11A.01R.1541.07
## 2 44253 94956
## 12 2159 2375
## 14 8329 6731
## 16 12061 13426
## 18 19929 7635
## TCGA.B0.5709.11A.01R.1541.07 TCGA.B0.5711.11A.01R.1672.07
## 2 61381 81181
## 12 2074 2678
## 14 7668 8945
## 16 11075 12953
## 18 25920 18655
## TCGA.B0.5712.11A.01R.1672.07 TCGA.B2.5636.11A.01R.1541.07
## 2 165423 88791
## 12 1845 2033
## 14 6498 6647
## 16 9655 11014
## 18 3171 24671
## TCGA.B8.5549.11A.01R.1541.07 TCGA.B8.5552.11A.01R.1672.07
## 2 68264 122109
## 12 2297 1671
## 14 7042 4443
## 16 11353 7354
## 18 1438 1815
## TCGA.CJ.5680.11A.01R.1541.07 TCGA.CJ.6030.11A.01R.1672.07
## 2 85768 66273
## 12 2094 1644
## 14 5705 10338
## 16 9369 14308
## 18 4989 14743
## TCGA.CW.5580.11A.02R.1672.07 TCGA.CW.5585.11A.01R.1541.07
## 2 162520 111561
## 12 2849 2009
## 14 9621 8669
## 16 12294 12937
## 18 8742 28314
## TCGA.CW.5587.11A.01R.1541.07 TCGA.CW.5589.11A.01R.1541.07
## 2 57992 58145
## 12 2005 1909
## 14 7388 8045
## 16 13056 11578
## 18 28450 32322
## TCGA.CW.5591.11A.01R.1541.07 TCGA.CW.6087.11A.01R.1672.07
## 2 268276 50936
## 12 3544 2215
## 14 9656 7374
## 16 8113 13435
## 18 474 11368
## TCGA.CW.6088.11A.01R.1672.07 TCGA.CW.6090.11A.01R.1672.07
## 2 124218 54273
## 12 3581 2060
## 14 10869 6421
## 16 14779 12600
## 18 2032 44607
## TCGA.CZ.4863.11A.01R.1503.07 TCGA.CZ.4864.11A.01R.1503.07
## 2 100202 100861
## 12 2499 2015
## 14 12234 8487
## 16 13471 13555
## 18 1854 36257
## TCGA.CZ.4865.11A.01R.1503.07 TCGA.CZ.5452.11A.01R.1503.07
## 2 133636 57375
## 12 2079 2036
## 14 8473 9704
## 16 8622 13255
## 18 2164 1945
## TCGA.CZ.5453.11A.01R.1503.07 TCGA.CZ.5454.11A.01R.1503.07
## 2 129106 160618
## 12 2225 3201
## 14 9980 12174
## 16 13556 19838
## 18 3538 3821
## TCGA.CZ.5455.11A.01R.1503.07 TCGA.CZ.5457.11A.01R.1503.07
## 2 132123 91025
## 12 2145 2274
## 14 10933 11980
## 16 12748 13296
## 18 1221 744
## TCGA.CZ.5458.11A.01R.1503.07 TCGA.CZ.5461.11A.01R.1503.07
## 2 125882 114075
## 12 2349 2264
## 14 14544 8929
## 16 18502 13896
## 18 2986 10012
## TCGA.CZ.5462.11A.01R.1503.07 TCGA.CZ.5463.11A.01R.1503.07
## 2 107009 94839
## 12 3000 2682
## 14 14048 8824
## 16 20243 12365
## 18 3885 6959
## TCGA.CZ.5465.11A.01R.1503.07 TCGA.CZ.5467.11A.01R.1503.07
## 2 71648 96987
## 12 2802 1589
## 14 10720 8978
## 16 15365 12244
## 18 1488 2067
## TCGA.CZ.5468.11A.01R.1503.07 TCGA.CZ.5469.11A.01R.1503.07
## 2 115785 95492
## 12 2083 2016
## 14 9351 11212
## 16 12508 13376
## 18 1471 1760
## TCGA.CZ.5470.11A.01R.1503.07 TCGA.CZ.5984.11A.01R.1672.07
## 2 175437 103163
## 12 2906 1845
## 14 12262 6737
## 16 17086 9896
## 18 2878 20253
## 11408 more rows ...
##
## $samples
## group lib.size norm.factors
## TCGA.B0.4712.01A.01R.1503.07 1 49725026 0.8058921
## TCGA.B0.5402.01A.01R.1503.07 1 62640283 0.9852068
## TCGA.B0.5690.01A.11R.1541.07 1 45633765 0.9727027
## TCGA.B0.5705.01A.11R.1541.07 1 60479323 0.9851701
## TCGA.B0.5709.01A.11R.1541.07 1 58992784 0.9797455
## 71 more rows ...
##
## $genes
## symbol txlen txgc
## 2 A2M 4844 0.4882329
## 5 SERPINA3 3067 0.5249429
## 7 AAMP 2270 0.5792952
## 9 AARS 3477 0.5349439
## 10 ABAT 5586 0.4994629
## 11408 more rows ...
v <- voom(dgenorm, design, plot=FALSE)
FDRcutoff <- 0.01
mod0 <- model.matrix(~ 1, colData(se.paired.genes))
sv <- sva(v$E, mod = design, mod0 = mod0)
## Number of significant surrogate variables is: 7
## Iteration (out of 5 ):1 2 3 4 5
design.voom <- cbind(design, sv$sv)
fit <- lmFit(v, design)
fit <- eBayes(fit)
res <- decideTests(fit, p.value = FDRcutoff)
summary(res)
## (Intercept) typetumor samplecodes4863 samplecodes4864 samplecodes4865
## -1 0 3473 4 2 8
## 0 891 4203 11408 11410 11405
## 1 10522 3737 1 1 0
## samplecodes5402 samplecodes5452 samplecodes5453 samplecodes5454
## -1 3 1 2 2
## 0 11410 11412 11411 11410
## 1 0 0 0 1
## samplecodes5455 samplecodes5457 samplecodes5458 samplecodes5461
## -1 0 3 1 2
## 0 11413 11408 11412 11411
## 1 0 2 0 0
## samplecodes5462 samplecodes5463 samplecodes5465 samplecodes5467
## -1 1 0 5 4
## 0 11411 11413 11407 11409
## 1 1 0 1 0
## samplecodes5468 samplecodes5469 samplecodes5470 samplecodes5549
## -1 1 1 0 1
## 0 11411 11411 11413 11412
## 1 1 1 0 0
## samplecodes5552 samplecodes5580 samplecodes5585 samplecodes5587
## -1 8 5 1 3
## 0 11404 11406 11412 11408
## 1 1 2 0 2
## samplecodes5589 samplecodes5591 samplecodes5636 samplecodes5680
## -1 1 1 0 5
## 0 11412 11411 11413 11408
## 1 0 1 0 0
## samplecodes5690 samplecodes5705 samplecodes5709 samplecodes5711
## -1 5 5 5 1
## 0 11408 11407 11408 11412
## 1 0 1 0 0
## samplecodes5712 samplecodes5984 samplecodes6030 samplecodes6087
## -1 6 1 1 3
## 0 11405 11411 11412 11408
## 1 2 1 0 2
## samplecodes6088 samplecodes6090
## -1 1 2
## 0 11412 11411
## 1 0 0
volcanoplot(fit, coef = 2, highlight = 7, fit$genes$symbol, main = "Model", las = 1)
colnames(design)
## [1] "(Intercept)" "typetumor" "samplecodes4863"
## [4] "samplecodes4864" "samplecodes4865" "samplecodes5402"
## [7] "samplecodes5452" "samplecodes5453" "samplecodes5454"
## [10] "samplecodes5455" "samplecodes5457" "samplecodes5458"
## [13] "samplecodes5461" "samplecodes5462" "samplecodes5463"
## [16] "samplecodes5465" "samplecodes5467" "samplecodes5468"
## [19] "samplecodes5469" "samplecodes5470" "samplecodes5549"
## [22] "samplecodes5552" "samplecodes5580" "samplecodes5585"
## [25] "samplecodes5587" "samplecodes5589" "samplecodes5591"
## [28] "samplecodes5636" "samplecodes5680" "samplecodes5690"
## [31] "samplecodes5705" "samplecodes5709" "samplecodes5711"
## [34] "samplecodes5712" "samplecodes5984" "samplecodes6030"
## [37] "samplecodes6087" "samplecodes6088" "samplecodes6090"
toptable <- topTable(fit, coef = 2, n=Inf)
par(mfrow=c(1,2), mar=c(4,5,2,2))
hist(toptable$P.Value, xlab="Raw P-values", main = "", las = 1)
qqt(fit$t[,2], df=fit$df.prior + fit$df.residual, main="", pch = ".", cex=3, ylim = c(-50, 50))
abline(0,1, lwd = 2)
se.paired.genes
## class: RangedSummarizedExperiment
## dim: 11413 76
## metadata(5): experimentData annotation cancerTypeCode
## cancerTypeDescription objectCreationDate
## assays(3): counts logCPM logCPMnorm
## rownames(11413): 2 12 ... 101340251 101340252
## rowRanges metadata column names(3): symbol txlen txgc
## colnames(76): TCGA.B0.4712.01A.01R.1503.07
## TCGA.B0.5402.01A.01R.1503.07 ... TCGA.CZ.5470.11A.01R.1503.07
## TCGA.CZ.5984.11A.01R.1672.07
## colData names(550): type bcr_patient_uuid ...
## lymph_nodes_aortic_pos_total samplecodes